# -*- coding: utf-8 -*-
'''
Created on 2016-12-22.

Runs gensim LDA over per-text Weibo corpora and writes the document-topic,
topic-word, and readable topic-phrase results to disk.

@author: ZhuJiahui
'''

import os
import time
from file_utils.file_reader import read_to_2d_list, read_to_2_list
from file_utils.file_writer import write_1d_to_text
from topic_utils.distribution_util import get_real_topics
from topic_utils.lda import gensim_lda


def weibo_lda_process(latent_topic_number, read_directory, write_directory1, write_directory2,
                      write_directory3, start_index=182, end_index=207):
    '''
    Run gensim LDA on each per-text corpus folder under read_directory.

    Each folder ``read_directory/<i+1>/`` is expected to contain ``<i+1>.docs``
    (documents as word-id sequences) and ``<i+1>.vocab`` (id-to-word mapping).

    :param latent_topic_number: number of latent topics
    :param read_directory: corpus root directory
    :param write_directory1: output path for document-topic distributions (THETA)
    :param write_directory2: output path for topic-word distributions (PHAI)
    :param write_directory3: output path for readable topic-word phrases
    :param start_index: first folder index (0-based) to process, inclusive
    :param end_index: folder index to stop before, exclusive
        (defaults keep the original hard-coded 182..206 range)
    :return: list of per-text LDA running times in seconds, as strings
    '''
    run_time = []
    delimiter = " "
    topic_words_to_print = 20

    for i in range(start_index, end_index):
        base = read_directory + '/' + str(i + 1) + '/' + str(i + 1)
        contents_in_id = read_to_2d_list(base + '.docs', delimiter)
        plain_word_id_list, word_list = read_to_2_list(base + '.vocab', delimiter)

        # Time the LDA run. time.clock() was removed in Python 3.8;
        # perf_counter() is the portable high-resolution replacement.
        start_time = time.perf_counter()
        _, THETA, PHAI = gensim_lda(contents_in_id, plain_word_id_list, latent_topic_number,
                                    topic_words_to_print=topic_words_to_print)
        this_time = time.perf_counter() - start_time
        run_time.append(str(this_time))

        # Human-readable topic-word phrases derived from the topic-word matrix.
        real_topics = get_real_topics(PHAI, word_list)

        # Serialize each distribution row as one space-separated text line.
        PHAI_to_string = [" ".join(str(x) for x in row) for row in PHAI]
        THETA_to_string = [" ".join(str(x) for x in row) for row in THETA]

        write_1d_to_text(write_directory1 + '/' + str(i + 1) + '.txt', THETA_to_string)
        write_1d_to_text(write_directory2 + '/' + str(i + 1) + '.txt', PHAI_to_string)
        write_1d_to_text(write_directory3 + '/' + str(i + 1) + '.txt', real_topics)

        print(this_time)

    return run_time


if __name__ == '__main__':

    # The corpus and output trees live one level above the current working directory.
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'
    read_directory = root_directory + 'dataset/text_model/tm_corpus'

    latent_topic_number = 100
    write_directory = root_directory + 'dataset/LDA'
    write_directory1 = root_directory + 'dataset/LDA/feed_topic' + str(latent_topic_number)
    write_directory2 = root_directory + 'dataset/LDA/topic_word' + str(latent_topic_number)
    write_directory3 = root_directory + 'dataset/LDA/real_topic' + str(latent_topic_number)

    # makedirs with exist_ok avoids the check-then-create race, and unlike
    # os.mkdir it also creates any missing parent directories.
    for directory in (write_directory, write_directory1, write_directory2, write_directory3):
        os.makedirs(directory, exist_ok=True)

    run_time = weibo_lda_process(latent_topic_number, read_directory,
                                 write_directory1, write_directory2, write_directory3)
    # Persist the per-text running times next to the result folders.
    write_1d_to_text(write_directory + '/time' + str(latent_topic_number) + '.txt', run_time)

    print('微博LDA过程结束.')